# Dr. M. Baron, Statistical Machine Learning class, STAT-427/627
# APPLICATIONS

1. Matrix computation for Google's PageRank algorithm

How to solve the matrix equation and obtain the Google solution for the
textbook example (Elements of Statistical Learning, pp. 576-578)
> x=c(0,1,1,0,0,0,1,0,1,0,0,0,0,0,1,0)
> L = matrix(x,4,4)   # Define a 4x4 matrix L showing linkage among web pages
> L                   # R reads a matrix column by column
     [,1] [,2] [,3] [,4]
[1,]    0    0    1    0
[2,]    1    0    0    0
[3,]    1    1    0    1
[4,]    0    0    0    0
> e = matrix( rep(1,4), 4, 1 ) # Vector of 1s
> e
[,1]
[1,] 1
[2,] 1
[3,] 1
[4,] 1
> C = diag( colSums(L) ) # Diagonal matrix C showing the number
> C # of outgoing links from each web page
     [,1] [,2] [,3] [,4]
[1,]    2    0    0    0   # Matrix multiplication is %*%
[2,]    0    1    0    0   # Inverse matrix is solve(C)
[3,]    0    0    1    0   # Transpose matrix is t(e)
[4,]    0    0    0    1
> A = 0.15*(e%*%t(e))/4 + 0.85*L%*%solve(C)
> A
       [,1]   [,2]   [,3]   [,4]
[1,] 0.0375 0.0375 0.8875 0.0375
[2,] 0.4625 0.0375 0.0375 0.0375
[3,] 0.4625 0.8875 0.0375 0.8875
[4,] 0.0375 0.0375 0.0375 0.0375
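# A quick sanity check (an added sketch, not in the original transcript):
# A is column-stochastic by construction, since each column of 0.15*(e%*%t(e))/4
# sums to 0.15 and each column of 0.85*L%*%solve(C) sums to 0.85.
> colSums(A)   # every entry should equal 1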
> eigen(A)   # Computes eigenvalues and eigenvectors of A
$values
[1]  1.000000e+00+0.00e+00i -4.250000e-01+4.25e-01i
[3] -4.250000e-01-4.25e-01i  5.477659e-18+0.00e+00i

$vectors
               [,1]                          [,2]
[1,] -0.64470397+0i  7.071068e-01+0.000000e+00i
[2,] -0.33889759+0i -3.535534e-01-3.535534e-01i
[3,] -0.68212419+0i -3.535534e-01+3.535534e-01i
[4,] -0.06489841+0i -2.175463e-17-4.017163e-18i
                           [,3]              [,4]
[1,]  7.071068e-01+0.000000e+00i -9.941546e-16+0i
[2,] -3.535534e-01+3.535534e-01i -7.071068e-01+0i
[3,] -3.535534e-01-3.535534e-01i -1.968623e-16+0i
[4,] -2.175463e-17+4.017163e-18i  7.071068e-01+0i
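# The PageRank vector can be read directly off this output: the first
# eigenvalue equals 1, and its eigenvector, rescaled to sum to one, is the
# stationary distribution. A minimal sketch (an addition, not in the
# original transcript):
> v = Re( eigen(A)$vectors[,1] )   # eigenvector for the eigenvalue 1; real up to rounding
> v / sum(v)                       # rescaled to sum to 1; should match the columns of A100 below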
> install.packages("expm”) # This package contains matrix powers
> library(expm) #
“Exponentiation of a matrix”
> A100 = A%^%100 # Large power of matrix A
> A100 # (a
simple but not an optimal method)
          [,1]      [,2]      [,3]      [,4]
[1,] 0.3725269 0.3725269 0.3725269 0.3725269
[2,] 0.1958239 0.1958239 0.1958239 0.1958239
[3,] 0.3941492 0.3941492 0.3941492 0.3941492
[4,] 0.0375000 0.0375000 0.0375000 0.0375000
# "Converged"! All columns are the same
> p = A100[,1]   # Take one column of A100
> p
[1] 0.3725269 0.1958239 0.3941492 0.0375000
> p = t(t(p))    # Now R understands it as a column
> p
[,1]
[1,] 0.3725269
[2,] 0.1958239
[3,] 0.3941492
[4,] 0.0375000
# Verify that it is a solution of p = Ap
> A%*%p
[,1]
[1,] 0.3725269
[2,] 0.1958239
[3,] 0.3941492
[4,] 0.0375000
# p is not properly normalized yet; we need e'p = N = 4,
> t(e)%*%p # but we have the sum of p[i] = 1
[,1]
[1,] 1
> p*4            # This is the solution. Ranked by importance, we have
          [,1]   # web pages ordered as 3, 1, 2, 4
[1,] 1.4901074
[2,] 0.7832956
[3,] 1.5765969
[4,] 0.1500000
# Another way is by iterations (the standard method of solving fixed-point equations):
> p = matrix(c(1,0,0,0),4,1)   # The initial vector is more or less arbitrary
> p
[,1]
[1,] 1
[2,] 0
[3,] 0
[4,] 0
> for (n in (1:100)){ p = A%*%p }   # Iterate p[next] = A*p[current]
> 4*p # Same result
[,1]
[1,] 1.4901074
[2,] 0.7832956
[3,] 1.5765969
[4,] 0.1500000
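# In practice, one iterates until convergence rather than a fixed 100 times.
# A minimal sketch of this variant (an addition; the tolerance 1e-12 is an
# arbitrary choice):
> p = matrix(c(1,0,0,0),4,1)
> repeat {
+   p.new = A %*% p
+   if ( max(abs(p.new - p)) < 1e-12 ) break   # successive iterates agree
+   p = p.new
+ }
> 4*p.new                                      # should reproduce the result above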
2. Spam detection
# The Spam data, with a description, are at https://archive.ics.uci.edu/ml/datasets/Spambase
# We can also get a formatted version from the package "kernlab"
> library(kernlab)
> data(spam)
> dim(spam)
[1] 4601   58
> names(spam)
[1] "make" "address" "all" "num3d"
[5] "our" "over" "remove" "internet"
[9] "order" "mail" "receive" "will"
[13]
"people"
"report"
"addresses"
"free"
[17]
"business"
"email"
"you"
"credit"
[21]
"your"
"font"
"num000"
"money"
[25]
"hp" "hpl"
"george" "num650"
[29]
"lab"
"labs"
"telnet"
"num857"
[33]
"data"
"num415"
"num85"
"technology"
[37]
"num1999"
"parts"
"pm"
"direct"
[41]
"cs"
"meeting"
"original"
"project"
[45]
"re" "edu"
"table"
"conference"
[49]
"charSemicolon" "charRoundbracket" "charSquarebracket"
"charExclamation"
[53]
"charDollar" "charHash" "capitalAve" "capitalLong"
[57]
"capitalTotal" "type"
> attach(spam)   # make the columns, such as "type", accessible by name
> table(type)
type
nonspam    spam
   2788    1813
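# For reference (an added line, not in the original transcript), the class
# proportions:
> round( prop.table( table(type) ), 2 )   # roughly 61% nonspam, 39% spam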
# LOGISTIC REGRESSION
> logreg = glm( type ~ ., data=spam, family = "binomial" )
> prob.spam = fitted.values(logreg)
> class.spam = ifelse( prob.spam > 0.5, "spam", "nonspam" )
> table(class.spam, type)
          type
class.spam nonspam spam
   nonspam    2666  194
   spam        122 1619
> mean( type == class.spam )
[1] 0.9313193   # Correct classification rate = 93% (should be validated
                # by CV; a sketch follows)
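# A minimal sketch of that cross-validation (an addition, not in the original
# transcript; K=10 folds and the 0.5 cutoff inside the cost function are
# assumptions):
> library(boot)
> cost01 = function(y, prob) mean( abs(y - prob) > 0.5 )   # 0-1 misclassification loss
> cv.glm( spam, logreg, cost=cost01, K=10 )$delta[1]       # CV estimate of the error rate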
# What are the error rates among spam and benign emails?
> sum( type=="nonspam"
& class.spam=="spam" ) / sum(
type=="nonspam" )
[1] 0.04375897
> sum( type=="spam" & class.spam=="nonspam" )
/ sum( type=="spam" )
[1] 0.107005
# Reducing the error rate for nonspam emails, with loss function L(1,0)=1,
# L(0,1)=10. Misclassifying a nonspam email as spam is now 10 times as costly,
# so the optimal rule classifies an email as spam only when P(spam) > 10/(10+1) = 10/11.
> class.spam = ifelse( prob.spam > 10/11, "spam", "nonspam" )
> table(class.spam, type)
          type
class.spam nonspam spam
   nonspam    2759  674
   spam         29 1139
> mean( type == class.spam )
[1] 0.8472071
> sum( type=="nonspam"
& class.spam=="spam" ) / sum(
type=="nonspam" )
[1] 0.01040172
> sum( type=="spam" & class.spam=="nonspam" )
/ sum( type=="spam" )
[1] 0.3717595
# The overall error rate increased, and the error rate of misclassifying spam
# emails as nonspam increased substantially, but the error rate for benign
# emails is only 1% now.
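# A sketch of why t = 10/11 is the right cutoff (an addition; the grid of
# thresholds is an arbitrary choice): the total loss,
# 1*(# spam passed) + 10*(# nonspam blocked), should be near its minimum at t = 10/11.
> for (t in c(0.5, 0.75, 10/11, 0.95)) {
+   cl = ifelse( prob.spam > t, "spam", "nonspam" )
+   loss = sum( type=="spam" & cl=="nonspam" ) + 10*sum( type=="nonspam" & cl=="spam" )
+   cat( "t =", round(t,3), " total loss =", loss, "\n" )
+ }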
# DECISION TREES
> library(tree)   # the tree() function is in the package "tree"
> tr = tree( type ~ ., data=spam )
> plot(tr); text(tr);
> summary(tr)
Misclassification error rate: 0.08259 = 380 / 4601
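# Before moving to random forests, the tree could also be pruned by
# cross-validation; a minimal sketch (an addition, using the standard
# cost-complexity tools of the "tree" package):
> cv.tr = cv.tree( tr, FUN=prune.misclass )   # CV error for each subtree size
> best = cv.tr$size[ which.min(cv.tr$dev) ]   # size with the smallest CV error
> prune.misclass( tr, best=best )             # the pruned tree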
# Optimization – random forests
> library(randomForest)
> rf = randomForest( type ~ ., data=spam )
> varImpPlot(rf)   # To see which variables are most significant in a tree-based classification
> rf
Number of trees: 500
No. of variables tried at each split: 7   # That's approx. the square root of the number of X-variables
OOB estimate of error rate: 4.52%         # The lowest error rate we could get so far
Confusion matrix:
        nonspam spam class.error
nonspam    2710   78  0.02797704
spam        130 1683  0.07170436
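# The default mtry is floor(sqrt(57)) = 7; a hedged sketch of comparing a few
# other values by their OOB error (an addition; the grid and ntree=300 are
# arbitrary choices):
> for (m in c(4, 7, 12)) {
+   rf.m = randomForest( type ~ ., data=spam, mtry=m, ntree=300 )
+   cat( "mtry =", m, " OOB error =", mean( rf.m$predicted != spam$type ), "\n" )
+ }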
# DISCRIMINANT ANALYSIS
> library(MASS)
> LDA = lda( type ~ ., data=spam, CV=TRUE )
> table( type, LDA$class )
type      nonspam spam
  nonspam    2656  132
  spam        394 1419
> mean( type != LDA$class )
[1] 0.114323   # This error rate is higher than the logistic regression's
> QDA = qda( type ~ ., data=spam, CV=TRUE )
> table( type, QDA$class )
type      nonspam spam
  nonspam    2090  691
  spam         87 1722
> mean( type != QDA$class )
[1] NA
> summary(QDA$class)
nonspam    spam    NA's
   2177    2413      11   # There are missing values; predictions were not computed by QDA
# We'll remove these points
> mean( type != QDA$class, na.rm=TRUE )
[1] 0.1694989   # No improvement with QDA; linear models are OK
# SUPPORT VECTOR MACHINES
> library(e1071)
> SVM = svm( type ~ ., data=spam, kernel="linear" )
> summary(SVM)
Number of Support Vectors: 940
( 456 484 )   # 940 support vectors (not cleanly classified) out of 4601 data points
> dim(spam)
[1] 4601 58
> class = predict(SVM)
> table(class,type)
         type
class     nonspam spam
  nonspam    2663  185
  spam        125 1628
> mean( class != type )
[1] 0.06737666   # The best rate so far? Note that this is a training (resubstitution)
                 # rate, while the random forest's 4.52% was an out-of-bag estimate;
                 # a cross-validation sketch follows.
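# A hedged sketch of validating and tuning the cost parameter by
# cross-validation (an addition; the grid is an arbitrary choice, and this
# refits the SVM many times, so it is slow on all 4601 emails):
> tuned = tune.svm( type ~ ., data=spam, kernel="linear", cost=c(0.1, 1, 10) )
> summary(tuned)        # CV error for each value of cost
> tuned$best.model      # the SVM with the smallest CV error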